{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ed0697d9-1199-4f04-9c7f-a3fd56551227",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "----!"
     ]
    }
   ],
   "source": [
    "#Sagemaker Endpoint Deploy\n",
    "from sagemaker.huggingface import HuggingFaceModel\n",
    "import sagemaker\n",
    "\n",
    "# Execution role that SageMaker returns for the current session.\n",
    "role = sagemaker.get_execution_role()\n",
    "\n",
    "# Hub model configuration (model ids: https://huggingface.co/models).\n",
    "# It is handed to HuggingFaceModel through its `env` argument.\n",
    "# Alternative model id kept for reference: 'shibing624/text2vec-base-chinese'.\n",
    "hub = {\n",
    "    'HF_MODEL_ID': 'shibing624/text2vec-large-chinese',\n",
    "    'HF_TASK': 'feature-extraction',\n",
    "}\n",
    "\n",
    "# Framework versions requested for the Hugging Face model class.\n",
    "container_versions = {\n",
    "    'transformers_version': '4.17.0',\n",
    "    'pytorch_version': '1.10.2',\n",
    "    'py_version': 'py38',\n",
    "}\n",
    "huggingface_model = HuggingFaceModel(role=role, env=hub, **container_versions)\n",
    "\n",
    "# Deploy the model to a SageMaker inference endpoint.\n",
    "# NOTE(review): the endpoint name says 'base' although HF_MODEL_ID is the\n",
    "# 'large' model. The name is left as-is because the preprocessing cell\n",
    "# connects to the endpoint by this exact string.\n",
    "endpoint_settings = {\n",
    "    'endpoint_name': 'huggingface-inference-text2vec-base-chinese-v1',\n",
    "    'initial_instance_count': 1,  # number of instances\n",
    "    'instance_type': 'ml.m5.xlarge',  # ec2 instance type\n",
    "}\n",
    "predictor = huggingface_model.deploy(**endpoint_settings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3454cf6e-a6ff-4937-9d79-11f38ed388d5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2672 out of 2673 finished"
     ]
    }
   ],
   "source": [
    "#Preprocess Data\n",
    "import pandas as pd\n",
    "import sagemaker\n",
    "from sagemaker.huggingface import HuggingFaceModel\n",
    "import json\n",
    "import boto3\n",
    "import requests\n",
    "import json\n",
    "import numpy as np\n",
    "\n",
    "def cosine_similarity(vector1, vector2):\n",
    "    \"\"\"Return the cosine similarity of two vectors.\n",
    "\n",
    "    Args:\n",
    "        vector1: First vector (anything np.dot and np.linalg.norm accept).\n",
    "        vector2: Second vector; expected to match vector1 in length.\n",
    "\n",
    "    Returns:\n",
    "        np.dot(vector1, vector2) divided by the product of both norms.\n",
    "    \"\"\"\n",
    "    numerator = np.dot(vector1, vector2)\n",
    "    # Product of the two vector norms as computed by np.linalg.norm.\n",
    "    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)\n",
    "    return numerator / denominator\n",
    "\n",
    "def get_vector(q):\n",
    "    \"\"\"Embed one piece of text through the `hfp` endpoint predictor.\n",
    "\n",
    "    Args:\n",
    "        q: The text to embed; it is sent as a one-element `inputs` list.\n",
    "\n",
    "    Returns:\n",
    "        Element [0][0][0] of the endpoint response.\n",
    "        NOTE(review): presumably the first token's vector for the single\n",
    "        input -- confirm against the endpoint's actual response shape.\n",
    "    \"\"\"\n",
    "    response = hfp.predict({'inputs':[q]})\n",
    "    return response[0][0][0]\n",
    "\n",
    "# Client for the already-deployed endpoint, looked up by its name.\n",
    "hfp = sagemaker.huggingface.model.HuggingFacePredictor('huggingface-inference-text2vec-base-chinese-v1')\n",
    "\n",
    "# Load the question table. The reader is chosen from the file extension,\n",
    "# because pd.read_excel cannot parse a plain-text CSV file.\n",
    "path = 'material.csv'\n",
    "df = pd.read_csv(path) if path.lower().endswith('.csv') else pd.read_excel(path)\n",
    "\n",
    "# Embed every question. Columns are addressed by name rather than by fixed\n",
    "# position, so additional columns in the file cannot shift the target cell.\n",
    "data = df['question'].tolist()\n",
    "total = len(data)\n",
    "question_vectors = []\n",
    "for i, question in enumerate(data, start=1):\n",
    "    # Each vector is stored in its string form, one per row.\n",
    "    question_vectors.append(str(get_vector(question)))\n",
    "    # 1-based counter, so the final update reads 'N out of N finished'.\n",
    "    print('\\r%i out of %i finished'%(i, total), end='')\n",
    "df['question_vector'] = question_vectors\n",
    "print()  # finish the carriage-return progress line\n",
    "\n",
    "# Sanity check: score one query against a few related sentences.\n",
    "test = \"我想去米亚罗\"\n",
    "test_embedding = get_vector(test)\n",
    "q_array = [\"我很喜欢米亚罗的风景\",\"我去过米亚罗\",\"我有意向去米亚罗旅游\",\"米亚罗下雪了\"]\n",
    "for q in q_array:\n",
    "    q_embeding = get_vector(q)\n",
    "    score = cosine_similarity(test_embedding, q_embeding)\n",
    "    # Plain str.format template: an f-string may not contain empty '{}'\n",
    "    # fields. The compared sentences are shown instead of the raw vectors.\n",
    "    print(\"{}, {} ,simlarity:{}\".format(test, q, score))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_pytorch_p39",
   "language": "python",
   "name": "conda_pytorch_p39"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
